##### Load libraries
libraries = c("tidyverse", "cowplot", "randomForest", "pROC", "epitools", "glmnet", "nlme", "ggforce", "gggenes", "viridis",#
"ggpubr", "data.table", "ggrepel", "Biostrings", "ggpubr", "pheatmap", "vegan", "sjPlot", "sjlabelled", "sjmisc",
"broom", "ggsci", "ggExtra");
invisible(suppressPackageStartupMessages(lapply(libraries, require, character.only = TRUE)));
options(dplyr.summarise.inform = FALSE)
plot_grid = cowplot::plot_grid
setwd('~/github/hmh/TN-Houston_ScienceAdvances_MS/analysis_files_revision')
#function for reformatting sample names to be consistent across datasets
mcov_reformat<-function(badly_formatted_name) {
MCoVNumber<-regmatches(badly_formatted_name, regexpr("[M,R,S,O]CoV.[0-9]+", badly_formatted_name)) %>% str_remove("-") %>% str_remove("_")
return(MCoVNumber)
}
#all mcov samples we have ever received - reformat sample names to be consistent
mcov_samples_all<-fread("full_lineage_report_20220507.tsv", data.table=F) %>% mutate(MCoVNumber=mcov_reformat(taxon)) %>% filter(startsWith(MCoVNumber, "MCoV")) %>% filter(MCoVNumber!="MCoV30904") #remove one sample with inconsistently formatted duplicates # EXPLAIN!
mcov_samples_all
#run and date info; keep only earliest sample from the same patient
mcov_info<-fread("sample_date_and_run.csv", data.table=F) %>% mutate(COLLECTION_DT=as.Date(COLLECTION_DT, "%m/%d/%y"), MCoVNumber=str_remove(mcov_id, "-")) %>% arrange(COLLECTION_DT) %>% filter(!duplicated(PatientID))
#24 samples present in two runs; disregard first run, NOT FOR TECHNICAL REPLICATES, REDO SEQUENCING (ask whole group Wes, randy paul)
dup65.66<-c("MCoV-55544_S733", "MCoV-55545_S734", "MCoV-55546_S735", "MCoV-55547_S736", "MCoV-55548_S737", "MCoV-55549_S738", "MCoV-55550_S739", "MCoV-55551_S740", "MCoV-55552_S741", "MCoV-55553_S742", "MCoV-55554_S743", "MCoV-55555_S744", "MCoV-55556_S745", "MCoV-55557_S746", "MCoV-55558_S747", "MCoV-55559_S748", "MCoV-55560_S749", "MCoV-55561_S750", "MCoV-55562_S751", "MCoV-55563_S752", "MCoV-55564_S753", "MCoV-55565_S754", "MCoV-55566_S755", "MCoV-55567_S756")
#samples we received that we're interested in (December 2020-November 2021)
mcov_samples_1<-mcov_samples_all %>% filter(!taxon %in% dup65.66) %>% filter(MCoVNumber %in% mcov_info$MCoVNumber) %>% left_join(mcov_info)
## Joining, by = "MCoVNumber"
#all samples sequenced in each run (including those from outside of this study period) -- sometimes relevant for run QC purposes
runs_all_samples<- fread("run_samples.csv", data.table = F) %>% mutate(MCoVNumber=str_remove('Sample ID',"-"), run=Run) %>% select(run, MCoVNumber)
#summary stats on coverage of each sample; drop the run 65 dudplicates and join coverage info to main dataset
d1 = fread('coverage_levels_20220507.csv', data.table = F) %>% filter(duplicated(samplename)) %>% pull(samplename)
d2 = fread('coverage_levels_20220507.csv', data.table = F) %>% filter(samplename %in% d1) %>% filter(!duplicated(samplename)) %>% pull(filename)
coverage_levels<-fread('coverage_levels_20220507.csv', data.table = F) %>% filter(!filename %in% d2) %>% filter(samplename!="MCoV30904") %>% select(-1)
#connect all info
mcov_samples<-mcov_samples_1 %>% select(MCoVNumber, lineage, scorpio_call, qc_status, COLLECTION_DT, INSTRUMENT, INSTRUMENT_RESULT, run=run_group) %>% left_join(coverage_levels, by=c("MCoVNumber"="samplename"))
mcov_samples_with_ct<-mcov_samples %>% filter(INSTRUMENT_RESULT<50) %>% mutate(CT=INSTRUMENT_RESULT)
#what's the relationship between CT value and read coverage?
coverage_all<-mcov_samples_with_ct %>% ggplot(aes(x=CT, y=median_coverage)) + geom_point(alpha=0.07) + theme_bw() + xlab("Ct value") + ylab("Sample median depth")
#how does coverage in high-CT samples compare with the rest of them?
coverage_ct_cat<-mcov_samples_with_ct %>% mutate(sample_ct=if_else(CT>=40, "CT>=40", "CT<40")) %>%
ggplot(aes(x=sample_ct, y=median_coverage)) + geom_point(alpha=0.2, position=position_jitter(width=0.25)) + geom_boxplot(color="red", alpha=0) + theme_bw() + xlab("Ct value") + ylab("Sample median depth")
#see that CT>=40 samples tend to have extremely low coverage, but some outliers
#SUPP FIG 12
#coverage_pre_filtering<-plot_grid(coverage_all, coverage_ct_cat)
#coverage_pre_filtering
#are there any runs where high-CT samples have unusually high coverage? treat CT>40 samples as negative controls and eliminate runs where their coverage is not different from those of the rest of the samples
test_group<-mcov_samples_with_ct %>% mutate(is_neg=if_else(CT>=40,1,0)) %>% group_by(run) %>% mutate(n_negs=sum(is_neg)) %>% filter(n_negs>=3) %>% ungroup() %>% mutate(sampletype=if_else(is_neg==1, "negctrl","sample"))
runs_to_drop1 = test_group %>% group_by(run) %>% summarise(t_test_p=t.test(fraction_1000x_coverage~sampletype)$p.value) %>% arrange(desc(t_test_p)) %>% filter(t_test_p>0.01) %>% pull(run)
runs_to_drop2 = test_group %>% group_by(run) %>% summarise(t_test_p=t.test(median_coverage~sampletype)$p.value) %>% arrange(desc(t_test_p)) %>% filter(t_test_p>0.01) %>% pull(run)
############
runs_to_drop<-union(runs_to_drop1, runs_to_drop2)
runs_to_drop
## [1] "Run_20" "Run_89" "Run_21" "Run_58" "Run_76" "Run_71" "Run_75" "Run_86"
## [9] "Run_74" "Run_70" "Run_78" "Run_62" "Run_13" "Run_85" "Run_65" "Run_34"
## [17] "Run_87" "Run_11" "Run_83" "Run_16" "Run_77" "Run_81"
mcov_samples_with_ct %>% pull(run) %>% unique()
## [1] "Run_11" "Run_12" "Run_15" "Run_13" "Run_16" "Run_18" "Run_19" "Run_20"
## [9] "Run_21" "Run_22" "Run_23" "Run_24" "Run_25" "Run_26" "Run_27" "Run_28"
## [17] "Run_29" "Run_30" "Run_31" "Run_35" "Run_32" "Run_33" "Run_34" "Run_37"
## [25] "Run_36" "Run_39" "Run_40" "Run_41" "Run_43" "Run_44" "Run_46" "Run_48"
## [33] "Run_49" "Run_51" "Run_52" "Run_53" "Run_57" "Run_58" "Run_59" "Run_61"
## [41] "Run_62" "Run_63" "Run_64" "Run_65" "Run_66" "Run_68" "Run_69" "Run_70"
## [49] "Run_71" "Run_72" "Run_73" "Run_74" "Run_75" "Run_76" "Run_77" "Run_78"
## [57] "Run_79" "Run_80" "Run_81" "Run_82" "Run_83" "Run_85" "Run_86" "Run_87"
## [65] "Run_88" "Run_89" "Run_90" "Run_14" "Run_17"
test_group %>% group_by(run, sampletype) %>% summarize(counts = n()) %>% filter(run %in% runs_to_drop) %>% nrow()
## [1] 44
mcov_samples_with_ct %>% pull(run) %>% unique()
## [1] "Run_11" "Run_12" "Run_15" "Run_13" "Run_16" "Run_18" "Run_19" "Run_20"
## [9] "Run_21" "Run_22" "Run_23" "Run_24" "Run_25" "Run_26" "Run_27" "Run_28"
## [17] "Run_29" "Run_30" "Run_31" "Run_35" "Run_32" "Run_33" "Run_34" "Run_37"
## [25] "Run_36" "Run_39" "Run_40" "Run_41" "Run_43" "Run_44" "Run_46" "Run_48"
## [33] "Run_49" "Run_51" "Run_52" "Run_53" "Run_57" "Run_58" "Run_59" "Run_61"
## [41] "Run_62" "Run_63" "Run_64" "Run_65" "Run_66" "Run_68" "Run_69" "Run_70"
## [49] "Run_71" "Run_72" "Run_73" "Run_74" "Run_75" "Run_76" "Run_77" "Run_78"
## [57] "Run_79" "Run_80" "Run_81" "Run_82" "Run_83" "Run_85" "Run_86" "Run_87"
## [65] "Run_88" "Run_89" "Run_90" "Run_14" "Run_17"
test_group %>% group_by(run, sampletype) %>% summarize(counts = n()) %>% filter(sampletype=="negctrl") %>% arrange(counts) %>% mutate(dropped = run %in% runs_to_drop) %>%
ggplot(aes(dropped,counts, label=run)) + geom_boxplot() + geom_point() + geom_text_repel() + theme_pubr()
## Warning: The following aesthetics were dropped during statistical transformation: label
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning: ggrepel: 28 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
runs_kept = mcov_samples_with_ct$run[!mcov_samples_with_ct$run %in% runs_to_drop] %>% unique()
# for (i in runs_kept) {
# f = mcov_samples_with_ct %>% filter(run == i) %>% ggplot(aes(INSTRUMENT_RESULT, fraction_1000x_coverage)) + geom_point() + ggtitle(i) + theme(plot.title = element_text(size = 40, face = "bold"))
# print(f)
# }
f1 = mcov_samples_with_ct %>% ggplot(aes(INSTRUMENT_RESULT, fraction_1000x_coverage)) + geom_point(shape=".", alpha = 0.5) + theme_pubr() + theme(plot.title = element_text(size = 40, face = "bold")) + geom_vline(xintercept=40, color = "red")
ggMarginal(f1)
f2 = mcov_samples_with_ct %>% ggplot(aes(INSTRUMENT_RESULT, log10(median_coverage))) + geom_point(shape=".", alpha = 0.5) + theme_pubr() + theme(plot.title = element_text(size = 40, face = "bold")) + geom_vline(xintercept=40, color = "red")
ggMarginal(f2)
f3 = mcov_samples_with_ct %>% filter(run %in% runs_kept) %>% ggplot(aes(INSTRUMENT_RESULT, log10(median_coverage))) + geom_point(shape=".", alpha = 0.5) + theme_pubr() + theme(plot.title = element_text(size = 40, face = "bold")) + geom_vline(xintercept=40, color = "red")
ggMarginal(f3)
f4 = mcov_samples_with_ct %>% ggplot(aes(INSTRUMENT_RESULT, log10(median_coverage))) + geom_point(shape=".", alpha = 0.5) + theme_pubr() + theme(plot.title = element_text(size = 40, face = "bold")) + geom_vline(xintercept=40, color = "red")
ggMarginal(f4)
tmp = data.table(mcov_samples_with_ct %>% filter(CT>35))[ , invisible(cor.test(fraction_1000x_coverage, -CT, method="spearman")[-2]), by=run] %>% select(run, p.value, estimate) %>%
mutate; tmp
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
## Warning in cor.test.default(fraction_1000x_coverage, -CT, method = "spearman"):
## Cannot compute exact p-value with ties
tmp = tmp %>% mutate(abnormal = run %in%
c("Run_20","Run_89","Run_21","Run_58","Run_76","Run_71","Run_75",
"Run_86","Run_74","Run_70","Run_78","Run_62","Run_13","Run_85",
"Run_65","Run_34","Run_87","Run_11","Run_83","Run_16","Run_77",
"Run_81"))
ggplot(tmp, aes(estimate, -log10(p.value), color = abnormal,
label = gsub("Run_", "", run))) +
geom_point() + geom_text_repel()+ theme_pubr()
## Warning: ggrepel: 29 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
tmp %>% arrange(-p.value)
# runs 2
### plot boxplot of the rns
stmp = mcov_samples_with_ct %>% mutate(abnormal = run %in%
c("Run_20","Run_89","Run_21","Run_58","Run_76","Run_71","Run_75",
"Run_86","Run_74","Run_70","Run_78","Run_62","Run_13","Run_85",
"Run_65","Run_34","Run_87","Run_11","Run_83","Run_16","Run_77",
"Run_81")) %>% mutate(run = gsub("Run_", "", run)) %>% mutate(is_neg=if_else(CT>=40,1,0))
ggplot(stmp, aes(run, fraction_1000x_coverage, fill = abnormal)) + geom_boxplot() +
geom_point(data = stmp %>% filter(is_neg == T),
aes(run, fraction_1000x_coverage), color = "red", alpha = .9, shape = 1) + scale_fill_grey(start=1, end=0.5) +
theme(axis.text.x = element_text(angle = 90, hjust = 1), text = element_text(size=8))
ggplot(stmp, aes(run, log10(median_coverage), fill = abnormal)) + geom_boxplot() +
geom_point(data = stmp %>% filter(is_neg == T),
aes(run, log10(median_coverage)), color = "red", alpha = .9, shape = 1) + scale_fill_grey(start=1, end=0.5) +
theme(axis.text.x = element_text(angle = 90, hjust = 1), text = element_text(size=8))
stmp
#nextclade QC
nc<-fread("houston_nextclade.tsv", sep='\t', data.table = F) %>% mutate(MCoVNumber=regmatches(seqName, regexpr("[M,R,S,O]CoV.[0-9]+", seqName)) %>% str_remove("-") %>% str_remove("_")) %>% filter(!duplicated(MCoVNumber))
nextclade_bad_samples<-nc %>% filter(qc.overallStatus %in% c("bad")) %>% pull(MCoVNumber)
#drop bad runs and samples that don't pass pangolin QC or nextclade QC
mcov_samples_filtered<-mcov_samples %>% filter(!run %in% runs_to_drop) %>%
filter(qc_status=="pass") %>% filter(!MCoVNumber %in% nextclade_bad_samples) %>% filter(scorpio_call!="Omicron (BA.1-like)") %>%
##### #main coverage criterion for fair comparisons: X depth over Y percent of the genome
filter(fraction_100x_coverage>=0.98) %>% droplevels()
#what's the distribution of CT values?
ct_distribution_after_qc<-mcov_samples_filtered %>% filter(INSTRUMENT_RESULT<50) %>% ggplot(aes(x=INSTRUMENT_RESULT)) + geom_histogram(binwidth=1) + theme_bw() + xlab("Ct value") + labs(caption="After exclusion criteria")
nrow(mcov_samples_filtered)
## [1] 15389
#How did exclusion criteria change distribution of sample Ct values?
ct_distribution_after_qc<-mcov_samples_filtered %>% filter(INSTRUMENT_RESULT<50) %>% ggplot(aes(x=INSTRUMENT_RESULT)) + geom_histogram(binwidth=1) + theme_bw() + xlab("Ct value")
ct_distribution_before_qc<-mcov_samples %>% filter(INSTRUMENT_RESULT<50) %>% ggplot(aes(x=INSTRUMENT_RESULT)) + geom_histogram(binwidth=1) + theme_bw() + xlab("Ct value") + labs(caption="Before exclusion criteria")
#ct_inclusion<-plot_grid(ct_distribution_before_qc, ct_distribution_after_qc, nrow=2)
#ct_inclusion
ct_distribution_after_qc<-mcov_samples_filtered %>% filter(INSTRUMENT_RESULT<50)
ct_distribution_before_qc<-mcov_samples %>% filter(INSTRUMENT_RESULT<50)
ggplot(aes(x=INSTRUMENT_RESULT), data = ct_distribution_before_qc) + geom_histogram(binwidth = 1, fill = "grey") +
geom_histogram(data = ct_distribution_after_qc, binwidth=1) + theme_pubr() + xlab("Ct value")
#nucleotide positions of all primers used in dataset; will exclude
fread("./nCoV-2019.artic_v3.primer.txt", sep="\t", header=FALSE, data.table=F) %>% select(start=V2, end=V3) -> primers
primer_positions_v3<-as.numeric()
for (i in 1:nrow(primers)){
primer_positions_v3<-c(primer_positions_v3, primers[i,]$start:primers[i,]$end)
}
fread("./nCoV-2019.artic_v4.primer.bed", sep="\t", header=FALSE, data.table = F) %>% select(start=V2, end=V3) -> primers
primer_positions_v4<-as.numeric()
for (i in 1:nrow(primers)){
primer_positions_v4<-c(primer_positions_v4, primers[i,]$start:primers[i,]$end)
}
fread("./V4.1.bed", sep="\t", header=FALSE, data.table=F) %>% select(start=V2, end=V3) -> primers
primer_positions_v4.1<-as.numeric()
for (i in 1:nrow(primers)){
primer_positions_v4.1<-c(primer_positions_v4.1, primers[i,]$start:primers[i,]$end)
}
primer_positions_all<-c(primer_positions_v3, primer_positions_v4, primer_positions_v4.1) %>% unique()
#reference genome with nucleotide positions of genes
genes<-fread("ntpos_gene_update.csv", data.table=F)
gene_names<-genes %>% pull(gene_id) %>% unique()
genes$gene_id<-factor(genes$gene_id, levels=gene_names)
### Update this if you change sample inclusion criteria
#minor_variant_sites_allLevels <- fread("minor_sites_100x_all_20220507.csv", data.table=F) %>% mutate(MCoVNumber=regmatches(name, #regexpr("[M,R,S,O]CoV.[0-9]+", name)) %>% str_remove("-") %>% str_remove("_")) %>% filter(MCoVNumber %in% #mcov_samples_filtered$MCoVNumber)
#write.csv(minor_variant_sites_allLevels, "minor_sites_workingsamples_100-98minimum.csv", row.names=FALSE)
### Update this if change the thresholds for counting minor variants
#file was already filtered to sites with minimum 100 reads depth and A,C,T,G minor variant present and binomial significance check passed. Further filtering:
#depth_at_site<-100
#minor_frequency<-0.01
#total_minor_reads<-50
#
#minor_variant_sites_threshold_applied<-fread("minor_sites_workingsamples_100-98minimum.csv", data.table = F) %>%
# filter(totalcount>=depth_at_site) %>%
# filter(!ntpos %in% 1:265) %>% filter(!ntpos>29674) %>% #don't include 5' and 3' UTR
# filter(!ntpos %in% primer_positions_all) %>% #don't include primer binding sites
# filter(major %in% c("A","C","T","G")) %>% #don't want minor variants at consensus deletion sites
# filter(minorfreq>=minor_frequency) #%>%
# #filter(minorfreq*totalcount>=total_minor_reads)
#write.csv(minor_variant_sites_threshold_applied, 'minor_variants_filtered_100x0.01_50.csv')
#load file that was generated/saved above
minor_variant_sites_threshold<-fread('minor_variants_filtered_100x0.01_50.csv', data.table=F)
n_var<-minor_variant_sites_threshold %>% group_by(MCoVNumber) %>% tally() %>% arrange(desc(n)) #this tally doesn't include any samples with 0 variants, so need to join to original list
samples_n_var<-mcov_samples_filtered %>% left_join(n_var) %>% arrange(COLLECTION_DT) %>% mutate(n_var=tidyr::replace_na(n,0))
## Joining, by = "MCoVNumber"
#what's the distribution of minor variant richness?
###
n_var_select<-samples_n_var %>% ggplot(aes(x=n_var)) + geom_histogram(binwidth=5) + theme_pubr() + xlab("No. minor variants in sample") + ylab("No. samples") + scale_y_continuous(trans='log1p', breaks=c(1, 10, 100, 1000, 5000)); n_var_select
#FIGURE 1C
#what's the relationship between minor variant richness and Ct value?
n_var_select_ct<-samples_n_var %>% filter(INSTRUMENT_RESULT<50) %>% ggplot(aes(x=INSTRUMENT_RESULT, y=n_var)) + geom_point(color = "black", shape = 1, alpha = 1/8) + theme_pubr() + scale_x_continuous(breaks = seq(0,40,by = 5)) +
xlab("Ct value") + ylab("No. minor variants")
n_var_select_ct
ggMarginal(n_var_select_ct)
patient_data<-fread("sample_and_patient_data.csv",data.table=F) %>% mutate(MCoVNumber=str_remove(mcov_id, "-")) %>% select(-COLLECTION_DT) %>% inner_join(samples_n_var)
## Joining, by = c("INSTRUMENT", "INSTRUMENT_RESULT", "MCoVNumber")
test_ct = patient_data %>% filter(INSTRUMENT_RESULT<40) %>% ggplot(aes(x=INSTRUMENT_RESULT, y=n_var, color = as.factor(Admitted_YN))) + geom_point(shape = 1, alpha = 1/8) + geom_density_2d(alpha = 1/2) + scale_y_continuous(trans = "log2", breaks = c(0,1,2,4,8,16,32,64, 128, 256))+
scale_x_continuous(breaks = seq(0,40,by = 5)) +
theme_pubr() + xlab("Ct value") + ylab("# minor variants") + geom_smooth(method=lm)
ggMarginal(test_ct, groupColour = TRUE, groupFill = TRUE, type = "violin", draw_quantiles = c(.5))
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Warning: Removed 459 rows containing non-finite values (`stat_density2d()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 459 rows containing non-finite values (`stat_smooth()`).
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Warning: Removed 459 rows containing non-finite values (`stat_density2d()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 459 rows containing non-finite values (`stat_smooth()`).
## Warning: `position_dodge()` requires non-overlapping x intervals
## Warning: Removed 459 rows containing non-finite values (`stat_ydensity()`).
## Warning: `position_dodge()` requires non-overlapping x intervals
n_var_select_ct
#FIGURE 1B
#n_var_ct_zoom<-n_var_select_ct + annotate("rect", xmin=5, xmax=26, ymin=0, ymax=Inf, alpha=0.2, fill="gray") + theme_pubr
#n_var_ct_zoom
# test_ct = patient_data %>% filter(INSTRUMENT_RESULT<40) %>% mutate(vaccine = if_else(Vaccine_Status=="No vaccine", 0, 1)) %>% ggplot(aes(x=INSTRUMENT_RESULT, y=n_var, color = as.factor(vaccine))) + geom_point(shape = 1, alpha = 1/8) + geom_density_2d(alpha = 1/2) + scale_y_continuous(trans = "log2", breaks = c(0,1,2,4,8,16,32,64, 128, 256))+
# scale_x_continuous(breaks = seq(0,40,by = 5)) +
# theme_pubr() + xlab("Ct value") + ylab("No. minor variants") + geom_smooth(method=lm)
# ggMarginal(test_ct, groupColour = TRUE, groupFill = TRUE, type = "violin", draw_quantiles = c(.5))
# ggdraw(n_var_select + theme_half_open(12)) +
# draw_plot(n_var_ct_zoom, x=0.3, y=0.3, width=0.7, height=0.7) +
# draw_plot_label(
# c("a", "b"),
# c(0, 0.3),
# c(1, 1),
# size = 12
# )
samples_n_var %>% pull(n_var) %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 3.00 6.00 18.94 16.00 452.00
all_replicates_table_filt<-fread("replicated_samples.csv", data.table=F) %>% filter(MCoVNumber %in% samples_n_var$MCoVNumber)
minors_table<-all_replicates_table_filt %>% filter(major.original %in% c("A","C","T","G")) %>% filter(minor.original %in% c("A","C","T","G")) %>%
filter(totalcount.original>=100 & minorfreq.original>=0.01 & totalcount.original*minorfreq.original>=50 & tolower(binocheck.original)!="false") %>% filter((!ntpos %in% primer_positions_all)) %>% filter(!ntpos %in% 1:265) %>% filter(!ntpos>29674)
#was the same minor variant found in the second replicate? if not, set minor frequency to 0 in second rep
minors_table<-minors_table %>% mutate(minorfreq.reseq=if_else(minor.original==minor.reseq, minorfreq.reseq, 0)) %>% mutate(detected_minor_in_repl=if_else(minor.original==minor.reseq,"yes","no"))
#how well are minor variants recovered in samples with different Ct values?
minors_table<-minors_table %>% left_join(select(mcov_samples, MCoVNumber, CT=INSTRUMENT_RESULT)) %>%
mutate(sampleCT_bin=case_when(CT<26 ~ "below 26",
CT>=26&CT<=35 ~"CT 26-35",
CT>35&CT<50 ~"greater than 35",
CT>100~"unknown", #aptima instrument uses RLU not CT
is.na(CT) ~ "unknown"))
## Joining, by = "MCoVNumber"
rep_ct<-minors_table %>% ggplot(aes(x=minorfreq.original, y=minorfreq.reseq, color=detected_minor_in_repl)) + scale_color_manual(values=c("red","black")) + geom_point(alpha=0.5) +
facet_wrap(~sampleCT_bin) + theme_bw() + labs(x="MAF in replicate 1", y="MAF in replicate 2") + theme(legend.position="bottom")
rep_ct
#how well are minor variants recovered in samples with different median coverage?
rep_depth<-minors_table %>% left_join(coverage_levels, by=c("MCoVNumber"="samplename")) %>% mutate(coverage_bin=cut(median_coverage, 4)) %>% ggplot(aes(x=minorfreq.original, y=minorfreq.reseq, color=detected_minor_in_repl)) + scale_color_manual(values=c("red","black")) + geom_point(alpha=0.5) +
facet_wrap(~coverage_bin) + theme_bw() + labs(x="MAF in replicate 1", y="MAF in replicate 2") + theme(legend.position="none", axis.text.x = element_text(color=c(1,0,1,0))) + labs(caption="Reproducibility in samples with different median depths")
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
#in the range of Ct values/coverage observed here, reproducibility seems more associated with Ct than with coverage
#what's the distribution of depth/MAF in reproducible vs. non-reproducible minor variants?
rep_depth_freq<-minors_table %>% ggplot(aes(x=totalcount.original, y=minorfreq.original)) + geom_point(alpha=0.5) + facet_grid(detected_minor_in_repl~.) + theme_bw() + theme(axis.text.x = element_text(color=c(1,0,1,0))) + xlab("Seq depth in rep 1") + ylab("MAF in rep 1") + labs(caption="Reproducibility at sites with different depth and MAF")
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
#depth and frequency of minor variants is also not very different between reproducible and non-reproducible variants
rep_mutations<-minors_table %>% filter(detected_minor_in_repl=="yes")
nonrep_mutations<-minors_table %>% filter(detected_minor_in_repl=="no")
a<-table(rep_mutations$major.original, rep_mutations$minor.original) %>% data.frame() %>% ggplot(aes(x=Var1, y=Var2, fill=Freq)) + geom_tile(colour = "black") + # grid colour
scale_fill_gradient(low = "white",
high = "steelblue") +
theme_minimal() + labs(fill = "Number",
x = "Consensus allele",
y = "Minor allele", caption="Reproducible minor variants")
b<-table(nonrep_mutations$major.original, nonrep_mutations$minor.original) %>% data.frame() %>% ggplot(aes(x=Var1, y=Var2, fill=Freq)) + geom_tile(colour = "black") +
scale_fill_gradient(low = "white",
high = "steelblue") +
theme_minimal() + labs(fill = "Number",
x = "Consensus allele",
y = "Minor allele", caption="Non-reproducible minor variants")
rep_nucleotides<-cowplot::plot_grid(a, b, ncol=2)
#SUPP FIG 8
rep_nucleotides
rep_indiv_samples<-minors_table %>% ggplot(aes(x=minorfreq.original, y=minorfreq.reseq, color=detected_minor_in_repl)) + geom_point(alpha=0.5) + scale_color_manual(values=c("red","black")) + facet_wrap(CT~MCoVNumber) + theme_bw() + xlab("MAF in replicate 1") + ylab("MAF in replicate 2") + theme(legend.position="none",axis.text.x = element_text(color=c(1,0,1,0)))
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
#SUPP FIG 1
cowplot::plot_grid(cowplot::plot_grid(rep_ct, cowplot::plot_grid(rep_depth, rep_depth_freq, ncol=2, rel_widths=c(1.2,1), labels=c("b","c")), nrow=2, labels=c("a",NA)), rep_indiv_samples, labels=c(NA, "d"), rel_widths=c(1.1,1))
## Warning: Removed 1 rows containing missing values (`geom_text()`).
## Removed 1 rows containing missing values (`geom_text()`).
Will limit analyses to samples with CT<26, where we are more confident in reproducibility of minor variant
samples_n_var %>% filter(INSTRUMENT_RESULT<26) %>% ggplot(aes(x=INSTRUMENT_RESULT, y=n_var)) + geom_point(alpha=0.5) + geom_smooth() + theme_bw() + xlab("Ct value") + ylab("No. minor variants")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
#slight association with Ct value still exists, but in this range assumption will be that this is more likely to do with real biological factors
samples_n_var %>% filter(INSTRUMENT_RESULT<26) %>% pull(n_var) %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 2.00 5.00 10.79 10.00 379.00
samples_to_analyze<-samples_n_var %>% filter(INSTRUMENT_RESULT<26) %>% pull(MCoVNumber)
lineages_figure<-mcov_samples_filtered %>% filter(MCoVNumber %in% samples_to_analyze) %>% mutate(variant=case_when(startsWith(scorpio_call,"Delta")~"Delta",
startsWith(scorpio_call,"Alpha")~"Alpha",
!(startsWith(scorpio_call,"Delta")|startsWith(scorpio_call,"Alpha")) ~ "other lineages")) %>%
mutate(variant=if_else(lineage=="B.1.2","B.1.2",variant)) %>%
ggplot(aes(x=COLLECTION_DT)) + geom_bar(aes(fill=variant)) + theme_bw() + scale_fill_manual(values=c("#00A08A", "#F2AD00", "#F98400", "#5BBCD6")) + xlab("Collection date") + ylab("No. samples") + ggtitle(paste0('Final sample set, CT<26, n=',length(samples_to_analyze))) + scale_x_date(minor_breaks="1 month")
#SUPP FIG 2
lineages_figure
samples_n_var %>% filter(INSTRUMENT_RESULT<26) %>% pull(n_var) %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 2.00 5.00 10.79 10.00 379.00
#plot_grid(coverage_pre_filtering, ct_inclusion, ncol=2, labels=c("a","b"))
minor_sites_lowct<-minor_variant_sites_threshold %>% left_join(mcov_samples_filtered) %>% filter(INSTRUMENT_RESULT<26)
## Joining, by = "MCoVNumber"